library(dplyr)

the.col.names <- c("year", "inc_ul_quin1_yb", "inc_ul_quin2_yb", "inc_ul_quin3_yb", "inc_ul_quin4_yb", "inc_ll_95_yb", "inc_mean_yb", "inc_50_yb")

# Process the actual Canadian data, starting with the 1976-2010 period.
canada.inc.dists <- read.csv("../source_data/ca/cansim Table 202-0601_20150803b.csv", stringsAsFactors=FALSE)
cid <- select(canada.inc.dists, Ref_Date, INCOMEGROUP, Value)
cid$inc_ul <- 0
cid$inc_ll <- 0
cid <- rename(cid, year=Ref_Date, pop_pc=Value)
cid[which(cid$pop_pc == "F"),]$pop_pc <- 0
cid$pop_pc <- as.numeric(cid$pop_pc)
cid[which(cid$INCOMEGROUP == "Under $5,000 (including loss) (percent)"),]$inc_ll <- 0
cid[which(cid$INCOMEGROUP == "Under $5,000 (including loss) (percent)"),]$inc_ul <- 5000
cid[which(cid$INCOMEGROUP == "$5,000 to $9,999 (percent)"),]$inc_ll <- 5000
cid[which(cid$INCOMEGROUP == "$5,000 to $9,999 (percent)"),]$inc_ul <- 10000
cid[which(cid$INCOMEGROUP == "$10,000 to $14,999 (percent)"),]$inc_ll <- 10000
cid[which(cid$INCOMEGROUP == "$10,000 to $14,999 (percent)"),]$inc_ul <- 15000
cid[which(cid$INCOMEGROUP == "$15,000 to $19,999 (percent)"),]$inc_ll <- 15000
cid[which(cid$INCOMEGROUP == "$15,000 to $19,999 (percent)"),]$inc_ul <- 20000
cid[which(cid$INCOMEGROUP == "$20,000 to $24,999 (percent)"),]$inc_ll <- 20000
cid[which(cid$INCOMEGROUP == "$20,000 to $24,999 (percent)"),]$inc_ul <- 25000
cid[which(cid$INCOMEGROUP == "$25,000 to $29,999 (percent)"),]$inc_ll <- 25000
cid[which(cid$INCOMEGROUP == "$25,000 to $29,999 (percent)"),]$inc_ul <- 30000
cid[which(cid$INCOMEGROUP == "$30,000 to $34,999 (percent)"),]$inc_ll <- 30000
cid[which(cid$INCOMEGROUP == "$30,000 to $34,999 (percent)"),]$inc_ul <- 35000
cid[which(cid$INCOMEGROUP == "$35,000 to $39,999 (percent)"),]$inc_ll <- 35000
cid[which(cid$INCOMEGROUP == "$35,000 to $39,999 (percent)"),]$inc_ul <- 40000
cid[which(cid$INCOMEGROUP == "$40,000 to $44,999 (percent)"),]$inc_ll <- 40000
cid[which(cid$INCOMEGROUP == "$40,000 to $44,999 (percent)"),]$inc_ul <- 45000
cid[which(cid$INCOMEGROUP == "$45,000 to $49,999 (percent)"),]$inc_ll <- 45000
cid[which(cid$INCOMEGROUP == "$45,000 to $49,999 (percent)"),]$inc_ul <- 50000
cid[which(cid$INCOMEGROUP == "$50,000 to $54,999 (percent)"),]$inc_ll <- 50000
cid[which(cid$INCOMEGROUP == "$50,000 to $54,999 (percent)"),]$inc_ul <- 55000
cid[which(cid$INCOMEGROUP == "$55,000 to $59,999 (percent)"),]$inc_ll <- 55000
cid[which(cid$INCOMEGROUP == "$55,000 to $59,999 (percent)"),]$inc_ul <- 60000
cid[which(cid$INCOMEGROUP == "$60,000 to $64,999 (percent)"),]$inc_ll <- 60000
cid[which(cid$INCOMEGROUP == "$60,000 to $64,999 (percent)"),]$inc_ul <- 65000
cid[which(cid$INCOMEGROUP == "$65,000 to $69,999 (percent)"),]$inc_ll <- 65000
cid[which(cid$INCOMEGROUP == "$65,000 to $69,999 (percent)"),]$inc_ul <- 70000
cid[which(cid$INCOMEGROUP == "$70,000 to $74,999 (percent)"),]$inc_ll <- 70000
cid[which(cid$INCOMEGROUP == "$70,000 to $74,999 (percent)"),]$inc_ul <- 75000
cid[which(cid$INCOMEGROUP == "$75,000 to $79,999 (percent)"),]$inc_ll <- 75000
cid[which(cid$INCOMEGROUP == "$75,000 to $79,999 (percent)"),]$inc_ul <- 80000
cid[which(cid$INCOMEGROUP == "$80,000 to $84,999 (percent)"),]$inc_ll <- 80000
cid[which(cid$INCOMEGROUP == "$80,000 to $84,999 (percent)"),]$inc_ul <- 85000
cid[which(cid$INCOMEGROUP == "$85,000 to $89,999 (percent)"),]$inc_ll <- 85000
cid[which(cid$INCOMEGROUP == "$85,000 to $89,999 (percent)"),]$inc_ul <- 90000
cid[which(cid$INCOMEGROUP == "$90,000 to $99,999 (percent)"),]$inc_ll <- 90000
cid[which(cid$INCOMEGROUP == "$90,000 to $99,999 (percent)"),]$inc_ul <- 100000
cid[which(cid$INCOMEGROUP == "$100,000 to $124,999 (percent)"),]$inc_ll <- 100000
cid[which(cid$INCOMEGROUP == "$100,000 to $124,999 (percent)"),]$inc_ul <- 125000
cid[which(cid$INCOMEGROUP == "$125,000 to $149,999 (percent)"),]$inc_ll <- 125000
cid[which(cid$INCOMEGROUP == "$125,000 to $149,999 (percent)"),]$inc_ul <- 150000
cid[which(cid$INCOMEGROUP == "$150,000 and over (percent)"),]$inc_ll <- 150000
cid[which(cid$INCOMEGROUP == "Average income (dollars)"),]$inc_ll <- NA
cid[which(cid$INCOMEGROUP == "Average income (dollars)"),]$inc_ul <- NA
cid[which(cid$INCOMEGROUP == "Median income (dollars)"),]$inc_ll <- NA
cid[which(cid$INCOMEGROUP == "Median income (dollars)"),]$inc_ul <- NA
cid <- filter(cid, inc_ll < 150000 | is.na(inc_ll) )
cid <- arrange(cid, year, inc_ll)
bounds <- data.frame()
for (yr in seq(1976, 2011, 1)) {
  inc.dist <- subset(cid, year == yr & !is.na(inc_ll), select=c("inc_ll", "inc_ul", "pop_pc"))
  inc.bounds <- get.pctile.inc.bounds(inc.dist)
  idxs <- which(cid$year == yr & is.na(cid$inc_ul))
  vals <- cid$pop_pc[idxs]
  the.mean <- max(vals)
  the.median <- min(vals)
  new.row <- append(c(yr), append(inc.bounds, c(the.mean, the.median)))
  bounds <- rbind(bounds, new.row)
}
names(bounds) <- the.col.names
write.dta(bounds, "../generated_data/canadaincomes_19762011_processed.dta")

bounds <- data.frame()
for (yr in seq(1976, 2011, 1)) {
  inc.dist <- subset(cid, year == yr & !is.na(inc_ll), select=c("inc_ll", "inc_ul", "pop_pc"))
  inc.bounds <- get.pctile.inc.bounds.by.dist2(inc.dist)
  idxs <- which(cid$year == yr & is.na(cid$inc_ul))
  vals <- cid$pop_pc[idxs]
  the.mean <- max(vals)
  the.median <- min(vals)
  new.row <- append(c(yr), append(inc.bounds, c(the.mean, the.median)))
  bounds <- rbind(bounds, new.row)
}
names(bounds) <- the.col.names
write.dta(bounds, "../generated_data/canadaincomes_19762011_fdist_processed.dta")

canada.inc.dists <- read.csv("../source_data/ca/canada_posttax_distribution_19631975.csv")
bounds <- data.frame()
for (yr in sort(unique(canada.inc.dists$year))) {
  inc.dist <- subset(canada.inc.dists, year == yr & !is.na(inc_ll), select=c("inc_ll", "inc_ul", "pop_pc"))
  inc.bounds <- get.pctile.inc.bounds(inc.dist)
  idxs <- which(canada.inc.dists$year == yr & is.na(canada.inc.dists$inc_ul))
  vals <- canada.inc.dists$pop_pc[idxs]
  the.mean <- max(vals)
  the.median <- min(vals)
  new.row <- append(c(yr), append(inc.bounds, c(the.mean, the.median)))
  bounds <- rbind(bounds, new.row)
}
names(bounds) <- the.col.names
write.dta(bounds, "../generated_data/canadaincomes_19631975_processed.dta")


# Process the actual Canadian data for the year 1963. 1965-1975 comes already-quintiled
# in the yearbooks.
#canada.inc.dists.1963 <- read.csv("../source_data/ca/canada_pretax_distribution_1963.csv")
#bounds <- data.frame()
#for (yr in c(1963)) {
#  inc.dist <- subset(canada.inc.dists.1963, year == yr & !is.na(inc_ll), select=c("inc_ll", "inc_ul", "pop_pc"))
#  inc.bounds <- get.pctile.inc.bounds(inc.dist)
#  idxs <- which(canada.inc.dists.1963$year == yr & is.na(canada.inc.dists.1963$inc_ul))
#  vals <- canada.inc.dists.1963$pop_pc[idxs]
#  the.mean <- max(vals)
#  the.median <- min(vals)
#  new.row <- append(c(yr), append(inc.bounds, c(the.mean, the.median)))
#  bounds <- rbind(bounds, new.row)
#}
#names(bounds) <- the.col.names
#write.dta(bounds, "../generated_data/canadaincomes_1963_processed.dta")

